********************************************************************************
****************************** PRE-MERGE CLEANING ***** ************************
********************************************************************************
{ /*** DO FILE DESCRIPTION ***/
/*
In this do file the AHS CAB and COMB data sets will be cleaned and prepared for merging.

Only data from the second updation round (i.e. year == 3) from the COMB dataset
	are cleaned. 
In order to use data from the baseline, the code will have to be adjusted.

Furthermore, variables from the MORT and WOMAN data sets are used in order to 
	generate mortality variables. As before, these steps would have to be adjusted 
	if the baseline (year == 1 ) is to be used.
	
Data sets used:
	state wise raw AHS CAB data sets
	state wise raw AHS COMB data sets
	state wise raw AHS WOMAN data sets
	state wise raw AHS MORT data sets
	
Data sets created:
	temporary:
		CAB_appended: appends raw CAB data sets
		CAB_state`s': these are the state wise cleaned CAB data sets that are 
			used for merging multiple temporary versions of the COMB data sets
		statewise WOMAN_mortality data sets that are used for merging
		statewise MORT_mortality data sets that are used for merging
		
		
	final:
		AHS_cab_cleaned: is the clean CAB data set that contains all AHS states
		MORT_mortality_appended contains the variables needed to calculate the 
			infant and child mortality rates
		WOMAN_mortality_appended contains the ratio of deaths of children to 
			live births for all women in a household
		CMW`s'_w3: data sets that will later be merged with the AHS CAB data set
			
Variables created:
	adultshh
age1													hypertension treatment & diagnosis
age3													height
blood pressure variables								hypertension variables
BMI variables											pregnant
child mortality rate									infant mortality rate
childrenhh 												malehh
diabetes treatment & diagnosis							married
diabetes variables (except treatm, diagn, broad)		ratio of deceased children
dropage													
education												rural
female													state
femalehh												state_dist
glucose variables										weight
hhid													

Macros used:
	$rawcab: folder contains the raw state wise AHS CAB data sets
	$rawcomb: folder that contains the raw state wise AHS COMB data sets
	$rawmort: folder that contains the raw state wise AHS MORT data sets
	$rawwom: folder that contains the raw state wise AHS WOMAN data sets
	$temp: folder contains data sets created during the data cleaning process 
		but will not be needed later anymore
	$final: folder contains data sets that will be used later during data cleaning

*/
}
* Start with no data in memory and run the rest of the file under version 13
* interpreter rules.
clear
version 13

* Folder globals used by every section below. The paths are absolute and
* specific to one machine, so they have to be edited before the do file is run
* elsewhere. Each value already ends in a backslash; the code below still adds
* its own separator when building file names.
* rawcab/rawcomb/rawmort/rawwom: raw state wise AHS input files
* temp: intermediate files, final: files kept for the later merge
global rawcab "C:\Users\Michaela\Desktop\rawdata\AHS\CAB\"
global rawcomb "C:\Users\Michaela\Desktop\rawdata\AHS\COMB\"
global rawmort "C:\Users\Michaela\Desktop\rawdata\AHS\MORT\"
global rawwom "C:\Users\Michaela\Desktop\rawdata\AHS\WOMAN\"
global temp "C:\Users\Michaela\Desktop\manipulated data\temp\"
global final "C:\Users\Michaela\Desktop\manipulated data\final\"



********************************************************************************
************************** CLEANING THE AHS CAB DATASET ************************
********************************************************************************
{ /*** APPENDING THE DATASETS OF THE INDIVIDUAL STATES ***/
/*
The raw state files are stacked in three steps because states 18 and 20 need
type fixes before they can be combined with the others. The running result is
kept in CAB_appended in the temp folder.
*/

* ---- step 1: states 05, 08, 09, 10 ----
use "$rawcab\05"

* these two variables must be strings to match the other state files
tostring record_code_iodine_reason v54, replace

append using "$rawcab\08"
* force: v54 is byte in the using files, however there are no observations
append using "$rawcab\09", force
append using "$rawcab\10", force

save "$temp\CAB_appended", replace

* ---- step 2: state 18 ----
use "$rawcab\18"

/*
Two records carry "MARRIED" and "NOT AT HOME" in state_code. That text is
preserved in a separate variable before state_code is overwritten.
*/
gen comment = cond(district_code == 502 & stratum == 42, "not at home", ///
	cond(district_code == 304 & stratum == 22, "married", ""))

tostring v54, replace

* every record of this file belongs to state 18; "Y" is not a valid location code
replace rural_urban = "" if rural_urban == "Y"
replace state_code = "18"
destring state_code rural_urban, replace

append using "$temp\CAB_appended"

save "$temp\CAB_appended", replace

* ---- step 3: state 20 followed by 21, 22, 23 ----
use "$rawcab\20"

replace state_code = 20
tostring v54, replace
replace rural_urban = "" if rural_urban == "Y"
destring rural_urban, replace

append using "$temp\CAB_appended"

* force: v54 is byte in the using files, however it is an empty variable
append using "$rawcab\21", force
append using "$rawcab\22", force
append using "$rawcab\23", force

save "$temp\CAB_appended", replace

********************************************************************************
}


{ /*** DATA CLEANING AND GENERATING VARIABLES ***/
/*
Loads the appended CAB file, keeps usual residents and derives the analysis
variables (identifiers, sex, age, pregnancy, anthropometry, blood pressure,
glucose, haemoglobin/anaemia, BMI). Nothing is saved in this section; the
labelling section that follows saves the result.

Reminder on missing values: in Stata a numeric missing is larger than every
number, so conditions of the form x > c are TRUE for missing x. All conditions
below that must exclude missings therefore say so explicitly.
*/
use "$temp\CAB_appended", clear

* unique identifier (psu_id)
	drop if psu_id == 1 // these four observations only contain missings/invalid entries
	* isid psu_id 
	
* keep only usual residents *
/* the sample will include only usual residents
*/
	keep if usual_residance == 1

	
* rename state variable 
	rename state_code state
	
* unique identifier for each district
/* two observations in Assam have a three digit district code (should be two digit)
	these district codes are set to missing and receive no state_dist value
*/
	replace district_code = . if district_code > 99 & !missing(district_code)
	gen state_dist = string(state, "%02.0f") + string(district_code, "%02.0f") ///
		if !missing(state) & !missing(district_code)
	order state dist state_dist
	
* rural or urban location of household 
	gen rural = (rural_urban == 1) if rural_urban != .
		
	
* generate variable for female *
/* a missing sex must stay missing: without the explicit check, sex > 0 would
	be true for missing values and the person would be coded as male
*/
	gen female = (sex == 2) if sex > 0 & !missing(sex)

* Age variables 
*** based on exact age stated
/* the age unit variables are days (1-29), months (1-11) and years (-1 - 104)
	Remark: Our cutoff in DLHS is 110 years. AHS data has already been cleaned
		and the maximum value is 104. 
	The unit code is compared case-insensitively so that a lower-case day code
	is treated like the upper-case one.
*/	
	gen age1 = age if upper(age_code) == "Y"
	replace age1 = 0 if inlist(upper(age_code), "D", "M")
	replace age1 = . if age1 < 0

*** based on date of birth
/* 
	values of date, month and year all have valid values; they all missing for 2 observations
	survey date is missing for 95 observations. All cases with missing values have 
	missings in each day, month and year.
	In 18 cases syear < byear. These cases will have missing age3 values.
 */
	rename year_of_birth byear
	rename month_of_birth bmonth
	rename date_of_birth bday
	
* individual variables for dmy for interviewdate
	split date_survey, p("/" "-") gen(dmy_survey)
	destring dmy_survey*, replace

	rename dmy_survey1 sday
	rename dmy_survey2 smonth
	rename dmy_survey3 syear
	
/* some survey years are 1914. I assume that it should be 2014. Survey years 13
		and 14 are assumed to be 2013 and 2014. 
*/
	replace syear = 2014 if syear == 1914 | syear == 14
	replace syear = 2013 if syear == 13

/* birthday already reached in the survey year -> difference of the years;
	birthday not yet reached -> one year less; born in the survey year -> 0
*/
	gen age3 = syear - byear if ((smonth > bmonth | (smonth == bmonth & sday >= bday)) & syear > byear) & smonth != . & byear != .
	replace age3 = syear - byear - 1 if ((smonth < bmonth | (smonth == bmonth & sday < bday)) & syear > byear) & syear != . & bmonth != .
	replace age3 = 0 if syear == byear & (smonth > bmonth | (smonth == bmonth & sday >= bday)) & smonth != . & byear != .
	replace age3 = . if age3 > 110
	
* pregnancy status
/* NOTE(review): the status is read from gauna_perfor_not_perfor; the variable
	name does not suggest pregnancy - confirm against the AHS CAB codebook
*/
	gen pregnant = 1 if gauna_perfor_not_perfor == 1
	replace pregnant = 0 if female == 0 | inlist(gauna_perfor_not_perfor, 2, 3)
	
* weight in kg
	gen weight = weight_in_kg
	replace weight = . if weight < 30 | weight > 160
	
* height in meters
	gen height = length_height_cm / 100
	replace height = . if height < 0.6 | height > 2.5
	
	
* Variables regarding the blood pressure
/* Remark: "broad" variables can't be created because diagnosis and treatment
	status are not included in this but in the COMB dataset
*/ 
	gen bpsyst_frst = bp_systolic if bp_systolic >= 70 & bp_systolic <= 240
	gen bpsyst_sec =  bp_systolic_2_reading if bp_systolic_2_reading >= 70 &  bp_systolic_2_reading <= 240
	
* average of both readings; falls back to the single valid reading
	gen bpsyst_avg = (bpsyst_frst + bpsyst_sec) / 2 if bpsyst_frst != . & bpsyst_sec != .
	replace bpsyst_avg = bpsyst_frst if missing(bpsyst_sec)
	replace bpsyst_avg = bpsyst_sec if missing(bpsyst_frst)	
	
/* Remark: the lowest value of the diastolic readings is 40. This variable 
	obviously has already been cleaned. In the DLHS data we set our cutoff at 30.
*/
	gen bpdiast_frst = bp_diastolic if bp_diastolic >= 30 & bp_diastolic <= 120
	gen bpdiast_sec = bp_diastolic_2reading if bp_diastolic_2reading >= 30 & bp_diastolic_2reading <= 120
	
	gen bpdiast_avg = (bpdiast_frst + bpdiast_sec) / 2 if bpdiast_frst != . & bpdiast_sec != .
	replace bpdiast_avg = bpdiast_frst if missing(bpdiast_sec)
	replace bpdiast_avg = bpdiast_sec if missing(bpdiast_frst)
	
/* narrow hypertension: 1 if either value is elevated, 0 only if both values
	are available and normal, missing otherwise
*/
	gen htn_narrow_avg = 1 if (bpsyst_avg >= 140 & !missing(bpsyst_avg)) | (bpdiast_avg >= 90 & !missing(bpdiast_avg))
	replace htn_narrow_avg = 0 if bpsyst_avg < 140 & bpdiast_avg < 90
		
	gen htn_narrow_sec = 1 if (bpsyst_sec >= 140 & !missing(bpsyst_sec)) | (bpdiast_sec >= 90 & !missing(bpdiast_sec))
	replace htn_narrow_sec  = 0 if bpsyst_sec < 140 & bpdiast_sec < 90

/* stage 2 and urgency: missing only if BOTH the systolic and the diastolic
	value are missing
*/
	gen htn_stage2_avg = (bpsyst_avg >= 160 & !missing(bpsyst_avg)) | (bpdiast_avg >= 100 & !missing(bpdiast_avg))
	replace htn_stage2_avg = . if bpsyst_avg == . & bpdiast_avg == .
		
	gen htn_stage2_sec = (bpsyst_sec >= 160 & !missing(bpsyst_sec)) | (bpdiast_sec >= 100 & !missing(bpdiast_sec))
	replace htn_stage2_sec = . if bpsyst_sec == . & bpdiast_sec == .
		
	gen htnurgency_avg = (bpsyst_avg >= 180 & !missing(bpsyst_avg)) | (bpdiast_avg >= 110 & !missing(bpdiast_avg))
	replace htnurgency_avg = . if bpsyst_avg == . & bpdiast_avg == .
	
	gen htnurgency_sec = (bpsyst_sec >= 180 & !missing(bpsyst_sec)) | (bpdiast_sec >= 110 & !missing(bpdiast_sec))
	replace htnurgency_sec = . if bpsyst_sec == . & bpdiast_sec == .


* variables regarding the glucose level
/* Remark: This is the FASTING glucose level. In the DLHS data we also have
	not-fasting glucose level
	Furthermore, the lowest value is 50 and the highest 400 (hence, this variable
	has already been cleaned). In the DLHS data we set the cutoffs at 40 and 600. 
*/
	gen glucose = fasting_blood_glucose_mg_dl if fasting_blood_glucose_mg_dl > 0
* converting capillary into venous blood glucose level
	replace glucose = glucose * 1.11
		
	gen glucgrt200 = 1 if glucose >= 200 & glucose != .
	replace glucgrt200 = 0 if glucose < 200
		
	gen diab_narrow = 1 if glucose >= 126 & glucose != .
	replace diab_narrow = 0 if glucose < 126
	
* hemoglobin and anaemia variables
	gen hemoglobin = haemoglobin_level
	replace hemoglobin = . if hemoglobin < 5 | hemoglobin > 22
	
* anemia variables
/* The cut-offs are defined according to the WHO http://www.who.int/vmnis/indicators/haemoglobin/en/
	The values will be set missing for individuals <18y and pregnant women.
			However, for completeness (and to be able to check plausibility), 
			the code will be kept here.
	Women with an unknown pregnancy status are classified with the cut-off
	for pregnant women (pregnant != 0).
*/
	gen anaemia = 0
	replace anaemia = 1 if hemoglobin < 11 & age1 < 5 
	replace anaemia = 1 if hemoglobin < 11.5 & age1 >= 5 & age1 < 12
	replace anaemia = 1 if hemoglobin < 12 & age1 >= 12 & age1 < 15
	replace anaemia = 1 if hemoglobin < 12 & female == 1 & pregnant == 0 & age1 >= 15 & age1 != .
	replace anaemia = 1 if hemoglobin < 11 & female == 1 & pregnant != 0 & age1 >= 15 & age1 != .
	replace anaemia = 1 if hemoglobin < 13 & female == 0 & age1 >= 15 & age1 != .
* without a haemoglobin value, age or (for 15+) sex the status is unknown, not "no"
	replace anaemia = . if hemoglobin == . | age1 == .
	replace anaemia = . if age1 >= 15 & female == .

	
* severe anaemia
	gen sevanaemia = 0
	replace sevanaemia = 1 if hemoglobin < 7 & age1 < 5 
	replace sevanaemia = 1 if hemoglobin < 8 & age1 >= 5 & age1 < 15
	replace sevanaemia = 1 if hemoglobin < 8 & female == 1 & pregnant == 0 & age1 >= 15 & age1 != .
	replace sevanaemia = 1 if hemoglobin < 7 & female == 1 & pregnant != 0 & age1 >= 15 & age1 != .
	replace sevanaemia = 1 if hemoglobin < 8 & female == 0 & age1 >= 15 & age1 != .
	replace sevanaemia = . if hemoglobin == . | age1 == .
	replace sevanaemia = . if age1 >= 15 & female == .


	

* set observations for pregnant women and individuals < 18y missing

*** clean height, weight, blood pressure & blood glucose ***
/* 
Set values for these variables missing for pregnant women. 
In this step I will also set these values missing for individuals who are younger than 18.
The cleaned second readings (bpsyst_sec, bpdiast_sec) are blanked as well, so
that they agree with the indicators that are derived from them.
NOTE(review): dropage is 0 when age1 is missing, i.e. individuals with unknown
age keep their values - confirm that this is intended.
*/
	gen dropage = (age1 < 18)
		
	foreach var of varlist glucose glucgrt200 diab_narrow bpsyst_frst bpsyst_sec bpsyst_avg ///
		bpdiast_frst bpdiast_sec bpdiast_avg htn_narrow_avg htn_narrow_sec htn_stage2_avg ///
		htn_stage2_sec htnurgency_avg htnurgency_sec height weight {
		 
		replace `var' = . if pregnant == 1 | dropage == 1
	}
	
* Body Mass Index
	gen BMI = weight/height^2 if weight != . & height != .
	replace BMI = . if BMI < 12 | BMI > 80
		
	gen overweight = (BMI > 25 & BMI <=30)
	replace overweight = . if BMI == .
		
	gen obese = (BMI > 30 & BMI != .)
	replace obese = . if BMI == .
		
	gen BMIgrt25 = (BMI > 25 & BMI != .)
	replace BMIgrt25 = . if BMI == .
		
	
/*gen variable indicating that these observations are from the AHS, in case we
	later append DLHS and AHS */
	
	gen DLHS = 0
}	
{ /*** LABEL VARIABLES ***/
/*
Attaches value labels and variable labels to the variables generated in the
cleaning section and then writes the cleaned CAB file to the final folder.
*/

	* ---- value labels ----
	label define state1 5 "Uttarakhand" 8 "Rajasthan" 9 "Uttar Pradesh" 10 "Bihar" 18 "Assam" ///
		20 "Jharkhand" 21 "Odisha" 22 "Chhattisgarh" 23 "Madhya Pradesh"
	label values state state1

	label define rural1 0 "urban" 1 "rural"
	label values rural rural1

	label define female1 0 "male" 1 "female"
	label values female female1

	* all 0/1 indicators share one no/yes label
	label define yesno 0 "no" 1 "yes"
	foreach indicator of varlist pregnant htn_narrow_avg htn_narrow_sec ///
		htn_stage2_avg htn_stage2_sec htnurgency_avg htnurgency_sec ///
		glucgrt200 diab_narrow dropage overweight obese BMIgrt25 DLHS {
		label values `indicator' yesno
	}

	* ---- variable labels ----
	label variable rural "location of household"
	label variable female "individual is female"
	label variable age1 "age on survey date based on indicated d|m|y"
	label variable age3 "age on survey day based on DOB"
	label variable pregnant "individual is pregnant"
	label variable weight "weight in kgs"
	label variable height "height in meters"
	label variable bpsyst_frst "first reading systolic cleaned"
	label variable bpsyst_sec "second reading systolic cleaned"
	label variable bpsyst_avg "average of readings systolic"
	label variable bpdiast_frst "first reading diastolic cleaned"
	label variable bpdiast_sec "second reading diastolic cleaned"
	label variable bpdiast_avg "average of readings diastolic"
	label variable htn_narrow_avg "suffers from hypertension, avg. reading"
	label variable htn_narrow_sec "suffers from hypertension, second reading"
	label variable htn_stage2_avg "second stage hypertension, average reading"
	label variable htn_stage2_sec "second stage hypertension, second reading"
	label variable htnurgency_avg "hypertensive urgency, average reading"
	label variable htnurgency_sec "hypertensive urgency, second reading"
	label variable glucose "fasting venous blood sugar level"
	label variable glucgrt200 "fasting venous blood sugar level above 200"
	label variable diab_narrow "suffers from diabetes according to test result"
	label variable dropage "under 18; base: age1"
	label variable BMI "Body Mass Index"
	label variable overweight "indiv is overweight"
	label variable obese "indiv is obese"
	label variable BMIgrt25 "BMI >25"
	label variable DLHS "data from DLHS dataset"

	* ---- store the cleaned CAB data set ----
	save "$final\AHS_cab_cleaned",replace
}
	
{ /*** STATEWISE CAB DATA SETS ***/
	* write one temporary CAB file per AHS state; only the records of the
	* requested state are read from the cleaned file
	local ahs_states 5 8 9 10 18 20 21 22 23
	foreach st of local ahs_states {
		use if state == `st' using "$final\AHS_cab_cleaned", clear
		save "$temp\CAB_state`st'", replace
	}
}


********************************************************************************
*********************** CLEANING THE AHS COMB DATASET **************************
********************************************************************************
{ /*** BASIC CLEANING OF STATE DATA SETS ***/
/*
For every AHS state the raw COMB file is loaded, restricted to the second
updation round (year == 3) and to usual residents, converted to numeric
variables and extended by marital status, education, diagnosis/treatment
indicators and the household composition (size, number of children, adults,
males and females). One file per state is saved in the temp folder.

mdesc is a user-written command and has to be installed.
*/

foreach s of numlist 5 8 9 10 18 20 21 22 23 {
	/* The separator in front of the state number is a forward slash on
		purpose: a backslash directly in front of a local macro reference tells
		Stata not to expand the macro, so the file would not be found.
	*/
	use "$rawcomb/`s'", clear
	di "state `s'"
	keep if year == 3
	
	/* the raw files do not contain the no/yes value label that is attached
		to the indicators below, so it is defined here
	*/
	label define yesno 0 "no" 1 "yes", replace
	
* only keep usual residents
	/* make sure both residence variables are numeric before they are
		compared with numbers
	*/
	destring usual_residance residancial_status, replace
	drop if usual_residance == 2
	
	keep if inlist(residancial_status, 1,3,4,5,6,.)
	drop if usual_residance == . & residancial_status== .
	
	
	
	foreach var of varlist injury_treatment_type treatment_source ///
		symptoms_pertaining_illness sought_medical_care diagnosed_for regular_treatment ///
		status hh_expall_status client_hl_id serial_no house_status building_no ///
		house_structure owner_status toilet_used is_toilet_shared ///
		household_have_electricity lighting_source cooking_fuel no_of_dwelling_rooms ///
		kitchen_availability is_radio is_television is_computer is_telephone ///
		is_washing_machine is_refrigerator is_sewing_machine is_bicycle is_car ///
		is_scooter is_tractor is_water_pump cart land_possessed wt  drinking_water_source ///
		sex usual_residance date_of_birth month_of_birth year_of_birth age religion ///
		marital_status currently_dead_or_out_migrated highest_qualification hh_id {
		
	destring `var', replace
	}
	
	
	
	* state specific invalid codes
	if `s' == 5 {
		di "basic cleaning state `s'"
		
		replace religion = . if religion == 25
		replace owner_status = . if owner_status < 1	
	}
		
	if `s' == 8 {
		di "basic cleaning state `s'"
		
		replace is_toilet_shared = . if is_toilet_shared > 2
		replace owner_status = . if owner_status > 3
		replace is_scooter = . if is_scooter == 22
		replace is_car = . if is_car == 3
	}
	
	sort psu_id // is unique identifier

* married
	di "MARRIED"
	
	gen married = inlist(marital_status, 2,3,4)
	replace married = . if marital_status == 8 | marital_status == .
	lab var married "person is married"
	lab val married yesno
	
* education
	di "EDUC"
	
	gen educ = 1 if highest_qualification <= 2
	replace educ = 2 if highest_qualification == 3
	replace educ = 3 if highest_qualification == 4
	replace educ = 4 if highest_qualification == 5 
	replace educ = 5 if highest_qualification == 6
	replace educ = 6 if highest_qualification <= 9 & highest_qualification >= 7
	lab var educ "educational attainment"
	lab def educ1 1 "<Primary" 2 "Primary" 3 "Middle" 4 "Secondary" 5 "High School" ///
		6 ">High School"
	lab val educ educ1
	
* diagnosis of diabetes and hypertension and regular treatment
	di "DIAB HTN"
	
	gen diab_diagnosed = (diagnosed_for == 1)
	lab var diab_diagnosed "was diabetes diagnosed"
	lab val diab_diagnosed yesno
	
	gen diab_treated = (diab_diagnosed == 1 & regular_treatment == 2)
	lab var diab_treated "regular treatment of diabetes"
	lab val diab_treated yesno
		
	gen htn_diagnosed = (diagnosed_for == 2)
	lab var htn_diagnosed "was hypertension diagnosed"
	lab val htn_diagnosed yesno
	
	gen htn_treated = (htn_diagnosed == 1 & regular_treatment == 2)
	lab var htn_treated "regular treatment of hypertension"
	lab val htn_treated yesno

 /*** CALCULATE HOUSEHOLD SIZE AND COMPOSITION FOR WAVE 3 ***/
 
	*duplicates drop
	sort fid // fid is our household identifier
	
*** generate a Household ID ***
	mdesc fid
	tostring fid, replace format(%15.0f)
	gen hhid = fid
	destring hhid fid, replace
	format hhid fid %15.0g
	drop if hhid == .

*** Household size ***
	sort hhid
	by hhid: egen hhsize = count(hhid)

*** number of adults and children per household ***
/* Problem: The age variable we have here is different to the one from the CAB 
	data set. However, we have no choice but to use this variable to determine the
	number of children and adults. Reason: we will be losing some household members
	during the merging process. Hence, we will not be able to determine the household
	composition with the merged data set
*/
	tab age, m
	replace age = 0 if age < 0
	* both indicators stay missing when the age is unknown
	gen child = (age < 18) if age != .
	gen adult = (age > 17) if age != .
	lab var child "indiv is younger than 18, base: age in COMB"
	lab var adult "indiv is 18+, base: age in COMB"
	
	by hhid: egen childhh = total(child)
	by hhid: egen adulthh = total(adult)
	lab var childhh "total no. of children in HH"
	lab var adulthh "total no. of adults in HH"
	
/* adulthh and childhh might be used for equivalence scales or mortality rates. 
	In order to avoid mistakes, the numbers of children and adults are set to 
	missing for households where at least one HH member has a missing value in age.
*/
	by hhid: egen missage = max(missing(age))
	replace childhh = . if missage == 1
	replace adulthh = . if missage == 1
	drop missage
	
	rename age age_COMB
	lab var age_COMB "completed years in COMB data set"
	
*** number of males and females per household ***
	tab sex, m
	destring sex, replace
	replace sex = . if sex != 1 & sex != 2
	
/* The indicators are missing when sex is unknown, so that such a person is
	neither counted as female nor as male. As for the age based counts, both
	household totals are set to missing when the sex of at least one member
	is unknown.
*/
	gen female = (sex == 2) if !missing(sex)
	by hhid: egen femalehh = total(female)
	
	gen male = (sex == 1) if !missing(sex)
	by hhid: egen malehh = total(male)
	
	by hhid: egen misssex = max(missing(sex))
	replace malehh = . if misssex == 1
	replace femalehh = . if misssex == 1
	
	drop misssex male female

	lab var femalehh "number of female household members"
	lab var malehh "number of male household members"
	
	save "$temp\COMB`s'_hhcomposition.dta", replace
}
}

********************************************************************************
************************** MORT AND WOMAN DATA SETS ****************************
********************************************************************************
{ /*** MORT DATA SET ***/
/* In the MORT data set we have three variables that indicate the age of the 
	individual when it died. We will extract data on infant and child mortality
	and later merge it with the COMB data set. From the COMB data set we will 
	obtain the household sizes with which we then can calculate the mortality
	rates for the households.
	
	The usual residents variables refer to the HH head. Hence, they are not
		used for dropping observations

	Result: one file per state with one record per household (temp folder) and
	the appended file of all states (final folder).
	mdesc is a user-written command and has to be installed.
*/
foreach s of numlist 5 8 9 10 18 20 21 22 23 {
	/* The separator in front of the state number is a forward slash on
		purpose: a backslash directly in front of a local macro reference tells
		Stata not to expand the macro, so the file would not be found.
	*/
	use "$rawmort/`s'", clear
	di "state `s'"
	keep if year == 3
	duplicates drop
	keep state district age_of_death_below_one_month age_of_death_below_eleven_month ///
		age_of_death_above_one_year fid age year_of_birth house_no house_hold_no
		
	foreach v in age_of_death_below_one_month age_of_death_below_eleven_month ///
			age_of_death_above_one_year fid age year_of_birth house_no house_hold_no {
		destring `v', replace
	}
	rename age_of_death_below_one_month b1m
	rename age_of_death_below_eleven_month b11m
	rename age_of_death_above_one_year a1y
	lab var b1m "age at death, below 1 month, MORT"
	lab var b11m "age at death, below 11 months, MORT"
	lab var a1y "age at death, above 1 year, MORT"
	
	replace a1y = . if a1y > 110
	replace b11m = . if b11m > 11
	
	* the counts below are diagnostics for the log only, no values are changed
	di "implausible b1m"
	count if (b1m > 30 | b1m < 0) & b1m != .
	di "implausible b11m"
	count if (b11m > 11 | b11m < 0) & b11m != .
	di "implausible a1y"
	count if a1y < 0

	mdesc fid // fid is never missing
	destring fid, replace
	rename fid hhid
	lab var hhid "household identifier"
	sort hhid
	format hhid %15.0g
	
	/* there are some invalid household IDs */	
	drop if inlist(hhid, 1020000000000, 1030000000000, 1040000000000) & `s' == 10
	drop if inlist(hhid, 1801000000000, 1801010000000, 1801020000000, 1801030000000) & `s' == 18
	drop if inlist(hhid, 2000000000000, 2010000000000) & `s' == 20
	drop if inlist(hhid, 2301000000000, 2301020000000, 2301030000000) & `s' == 23

	
/* Generate a valid hhid for the observations that have an invalid fid variable
	but that contains the PSU. For this we use the leading digits of the fid variable
	(state+district+psu; 6 digits in state 8, 7 digits in state 23) and then use 
	house_no (4 digits) and house_hold_no (2 digits).
	This is how the fid variables are constructed in the CAB and COMB data set
*/
	** State 8 ***
/* around 40 values of fid/hhid can be corrected as they contain the PSU.
*/
	if `s' == 8 {
		di "CORRECTION OF HHID IN STATE 8"
		tostring hhid, replace format(%20.0g)	
		gen hhid_short = substr(hhid, 1,6)
		destring hhid_short, replace
		gen hhid2 = string(hhid_short, "%06.0f") + string(house_no, "%04.0f") + string(house_hold_no, "%02.0f")
		destring hhid hhid2, replace
		
	* Check whether this generates the same hhid for all other observations
		# delimit ;
		count if (hhid != hhid2) & !inlist(hhid, 808003000000, 808004000000
			, 808007000000 , 808007000000 , 808016000000 , 808018000000
			, 808022000000 , 808025000000 , 808027000000 , 808029000000
			, 808001000000 , 808002000000 , 808005000000 , 808041000000
			, 808006000000 , 808008000000 , 808009000000 , 808010000000
			, 808011000000 , 808012000000 , 808013000000 , 808014000000
			, 808015000000 , 808017000000 , 808019000000 , 808020000000
			, 808021000000 , 808023000000 , 808024000000 , 808026000000
			, 808028000000 , 808030000000 , 808031000000 , 808032000000
			, 808033000000 , 808034000000 , 808035000000 , 808036000000
			, 808037000000 , 808038000000 , 808039000000 , 808040000000) ;
		# delimit cr // there are no observations --> it worked
			
		# delimit ;
		replace hhid = hhid2 if inlist(hhid, 808003000000, 808004000000
			, 808007000000 , 808007000000 , 808016000000 , 808018000000
			, 808022000000 , 808025000000 , 808027000000 , 808029000000
			, 808001000000 , 808002000000 , 808005000000 , 808041000000
			, 808006000000 , 808008000000 , 808009000000 , 808010000000
			, 808011000000 , 808012000000 , 808013000000 , 808014000000
			, 808015000000 , 808017000000 , 808019000000 , 808020000000
			, 808021000000 , 808023000000 , 808024000000 , 808026000000
			, 808028000000 , 808030000000 , 808031000000 , 808032000000
			, 808033000000 , 808034000000 , 808035000000 , 808036000000
			, 808037000000 , 808038000000 , 808039000000 , 808040000000) ;
		# delimit cr
		
		drop hhid2 hhid_short 
	}
	** State 23 **
* 2 HHIDs contain the PSU. For these we can construct proper HH IDs
	
	if `s' == 23 {
			di "CORRECTION OF HHID IN STATE 23"	
			tostring hhid, replace format(%20.0g)	
			gen hhid_short = substr(hhid, 1,7)
			destring hhid_short, replace
			gen hhid2 = string(hhid_short, "%07.0f") + string(house_no, "%04.0f") + string(house_hold_no, "%02.0f")

		* Check whether this generates the same hhid for all other observations
			count if hhid != hhid2 & hhid != "2301010000000" & hhid != "2301040000000" // yes, it does.
			replace hhid = hhid2 if hhid == "2301010000000" | hhid == "2301040000000"
			destring hhid, replace
			drop hhid2 hhid_short
	}
	
/* 0 and missings seem to have been used interchangeably in b1m, b11m, and a1y. 
	Decision rule: if b1m = 0 and b11m = a1y = . then the individual is believed
			to have died on their first day. The same rule applies if b11m = 0 and b1m = a1y = .
	When both of the other variables have the value of 0, then the mortality 
	variables will be coded as missings (this only occurs in states 5 and 9).
	If there are missings in all three variables, the mortality variables are coded
		as missings.
	I looked at the "age" variable to cross-check the assumptions. However, the age 
		variable does not show the age of the individual but the head
		of household.
*/	
	di "generate infant death"
	gen infant_death = ((b1m != . & b1m != 0) | (b11m != . & b11m != 0))
	replace infant_death = . if (b1m == 0| b1m == .) & (b11m == 0 | b11m == .) & (a1y == 0 | a1y == .)
	replace infant_death = 1 if b1m == 0 & b11m == . & a1y == .
	replace infant_death = 1 if b11m == 0 & b1m == . & a1y == .
	lab var infant_death "child was younger than 1y when deceasing, MORT data"
	drop if infant_death == .

	di "generate child death"
	* a missing a1y is not below 5 here because missings are larger than any number
	gen u5_death = (infant_death == 1 | a1y < 5)
	replace u5_death = . if infant_death == .
	lab var u5_death "child was younger than 5y when deceasing, MORT data"
	
	di "generate total death"
	sort hhid
	by hhid: egen total_infdeath = total(infant_death)
	lab var total_infdeath "total number of deceased infants in HH, MORT data"
	by hhid: egen total_u5death = total(u5_death)
	lab var total_u5death "total number of deceased children in HH, MORT data"

	di "only keep 1 obs per HH"
	bysort hhid: gen hhno = 1 if _n == 1
	drop if hhno != 1
	drop hhno age year_of_birth house_no house_hold_no
	
/* generate a variable that this information was obtained from MORT data set.
	Later, after having merged this information with the COMB data set we will 
	have to set the values of the mortality variables "0" but the missings 
	created here have to remain missings
*/
	gen MORT = 1
	lab var MORT "data from AHS MORT data set"
		
save "$temp\MORT_mortality_`s'", replace
}


/* Stack the state files. The file of state 23 is loaded explicitly as the
	starting point, so that the result does not depend on which data set
	happens to be in memory after the loop above.
*/
use "$temp\MORT_mortality_23", clear
foreach s of numlist 5 8 9 10 18 20 21 22 {
	append using "$temp\MORT_mortality_`s'"
}
save "$final\MORT_mortality_appended", replace
}

{ /*** WOMAN DATA SET ***/
/* in the WOMAN data set we have information on a woman's live births and how
	many of these children have survived. Unfortunately there is no information
	about the age at which the children died.
	The number of live births and deaths will be aggregated for all women living
	in the same household and a ratio will be calculated at the household level.
	We will not need any COMB data for that but merge our obtained mortality variable
	with the COMB data.
*/

foreach s of numlist 5 8 9 10 18 20 21 22 23 {
/* Load the raw WOMAN file of state s.
	The directory separator in front of the loop macro has to be a forward
	slash: Stata reads a backslash placed directly before a macro's opening
	quote as "do not expand this macro", so the state code would never be
	substituted into the file name. Stata accepts the forward slash as a
	directory separator on every platform.
*/
use "$rawwom/`s'", clear

/* drop women that are not usual residents; the residents variables refer to the
	woman herself
*/

	* make the residence variables numeric before filtering on them
	destring usual_residance, replace
	drop if usual_residance == 2
	destring residancial_status, replace
	* status 2 is excluded; a missing status is kept at this point ...
	keep if inlist(residancial_status, 1,3,4,5,6,.)
	* ... and only dropped when the usual-residence variable is missing as well
	drop if usual_residance == . & residancial_status == .
	
* other preparations
	* only the second updation round (year == 3) is used, see the file header
	keep if year == 3
		
	* remove records that are identical in every variable
	duplicates drop
	
	* Restrict the file to the variables used below and make sure that all of
	* them are stored as numbers.
	local wom_keep state district delivered_any_baby born_alive_female ///
		born_alive_male born_alive_total surviving_female surviving_male ///
		surviving_total marital_status fid fidh age year_of_birth
	keep `wom_keep'
	destring `wom_keep', replace
	
	* Shorter working names: b... = born alive, s... = surviving. The raw
	* totals get a _temp suffix.
	rename (marital_status delivered_any_baby) (married delivered)
	rename (born_alive_female born_alive_male born_alive_total) (bfem bmale btot_temp)
	rename (surviving_female surviving_male surviving_total) (sfem smale stot_temp)
		

/* Restrict the sample to women who report a delivery (delivered == 1).
	Women with a missing value in delivered almost always have a missing or
	"0" in the birth variables as well (the tabulations below show this per
	state), so they are dropped together with the women who did not deliver.
*/
	di "CHECK THAT IF DELIVERED = . BIRTH VARIABLES = 0 | ."
	foreach b in bfem bmale {
		tab `b' if delivered == ., missing
	}
	
	drop if delivered != 1
		
/* Cleaning of the sex-specific birth (bfem, bmale) and survivor (sfem, smale)
	counts. There are hardly any observations with a missing in these four
	variables; where a value cannot be inferred it is treated as missing.
	The steps below depend on their order: each imputation step uses values
	that were filled in by the steps before it.
	NOTE(review): mdesc is not an official Stata command - presumably it is
	installed from SSC; confirm it is available before running.
*/
	di "CHECK MISSINGS IN BIRTH AND SURVIVED VARIABLES"
	mdesc bfem bmale sfem smale
	
* drop observations that have missing in all four variables (the numbers in the
* trailing comments refer to the data set that includes all states)
	drop if bfem == . & bmale == . & sfem == . & smale == . // 16 obs were deleted
* drop observations that have more survivors than births; the "!= ." guard is
* needed because Stata treats a missing value as larger than any number
	drop if bfem < sfem & sfem != . // 101 obs deleted
	drop if bmale < smale & smale != . // 50 obs deleted
	
* assumption: we can replace bfem = . by "0" if sfem = 0 (same for males)
	replace bfem = 0 if sfem == 0 & bfem == . // 566 changes made
	replace bmale = 0 if smale == 0 & bmale == . // 185 changes made
	
/* replace missing birth variables with "0" if other gender birth variable is not
	missing; the male line runs second and therefore also sees the bfem values
	that were just set to 0 */
	replace bfem = 0 if bmale != . & bfem == . // 1191 changes made
	replace bmale = 0 if bfem != . & bmale == . // 909 changes made
	
* replace survivor variables = 0 if they are missing and birth variables are 0
	replace sfem = 0 if sfem == . & bfem == 0 // 1887 changes made
	replace smale = 0 if smale == . & bmale == 0 // 1217 changes made
	
/* drop the observations that still have missings in survivor variables because
	there is no way to find out where they come from and we cannot use them
	to calculate proper death rates

	this is the case for 0.01% observations in the national data set
*/
	drop if sfem == . | smale == .
	
	
* the "total" variables from the raw data will be dropped and new ones will be
* created from the cleaned sex-specific counts
	drop stot_temp btot_temp
	
	* live births per woman; women without any recorded live birth are dropped
	gen btot = bfem + bmale
	drop if btot == 0
	lab var btot "total number of live births, WOMAN"
	
	* surviving children per woman
	* (the label is attached to stot; the earlier version labelled btot a second
	* time, which overwrote the btot label and left stot unlabelled)
	gen stot = sfem + smale
	lab var stot "total number of surviving children, WOMAN"
	
* variable for total number of deceased children per woman; only defined when
* both totals are non-missing
	gen dtot = btot - stot if btot != . & stot != .
	lab var dtot "total number of deceased children, WOMAN"
	
	
/* Build the household identifier hhid.
	For states 5, 9, and 21 there are some missings in fid, whereas fidh has
	no missings, so the identifier is taken from the leading digits of fidh:
	the first 12 digits for all states with a code below 10 and the first
	13 digits for state 21.
	The remaining states have no missings in fid; there fid itself becomes hhid.
*/
	di "CHECK MISSINGS IN FID AND FIDH"
	mdesc fid fidh
	if `s' == 21 | `s' < 10 {
		* number of leading fidh digits that form the household identifier
		local idlen = cond(`s' == 21, 13, 12)
		tostring fidh, replace format(%20.0g)
		gen fid2 = substr(fidh, 1, `idlen')
		destring fid2 fidh, replace
		format fid fid2 %20.0g
		* number of records where a non-missing fid disagrees with the
		* identifier derived from fidh
		di "INCONSISTENCIES IN FID FIDH"
		count if fid != fid2 & fid != .
		rename fid2 hhid
	}
	else {
		di "rename fid hhid"
		rename fid hhid
	}
	sort hhid
	drop if hhid == .
	
/* we will now count the total number of dead children and live births
	on the household level; egen total() ignores missing values
*/
	di "generate total live births and deceased"
	bysort hhid: egen total_birth = total(btot)
	lab var total_birth "total number of live births in HH, WOMAN data"
	bysort hhid: egen total_death = total(dtot)
	lab var total_death "total number of deceased children in HH, WOMAN data"
	
	* keep the first record of every household
	di "only keep 1 obs per HH"
	bysort hhid: gen hhno = 1 if _n == 1
	drop if hhno != 1
	drop hhno age married delivered bfem bmale sfem smale year_of_birth fidh stot
	/* fid only exists for the states where hhid was derived from fidh; for the
		other states fid itself was renamed to hhid. Listing fid in the drop
		above only worked because Stata resolved it as an abbreviation of fidh
		and fails when variable abbreviation is switched off. It is therefore
		dropped separately and only if present.
	*/
	capture drop fid
		
	* household-level ratio of deceased children to live births
	gen mort_women = total_death/total_birth
	lab var mort_women "mortality ratio; total deaths/total births; not age specific"
	
	* flag: this information comes from the WOMAN data set
	gen WOMAN = 1
	lab var WOMAN "data from AHS WOMAN data set"
	
	drop btot dtot total_birth total_death
	
save "$temp\WOMAN_mortality_`s'", replace
}

* Combine the state files into one data set. The data of the last state
* processed by the loop above (23) are still in memory, so only the other
* eight states are appended.
local wom_states 5 8 9 10 18 20 21 22
foreach s of local wom_states {
	append using "$temp\WOMAN_mortality_`s'"
}

save "$final\WOMAN_mortality_appended", replace

}


{ /*** MERGING THE OBTAINED DATA WITH COMB WAVE 3 DATA ***/

foreach s of numlist 5 8 9 10 18 20 21 22 23 {

	* Start from the household-composition file of the state and attach the
	* household-level MORT variables. Households that only appear in the MORT
	* file are not kept.
	use "$temp\COMB`s'_hhcomposition", clear
	merge m:1 hhid using "$temp\MORT_mortality_`s'", keep(master match) nogenerate
	
/* Households without a MORT record keep missing values in total_infdeath and
	total_u5death, so that they can be told apart from the households where
	no child died. Only the source flag MORT is set to 0 for them.
*/
	replace MORT = 0 if missing(MORT)

	* Same for the WOMAN mortality ratio: WOMAN-only households are not kept,
	* and households without a WOMAN record get a ratio of 0.
	* NOTE(review): unlike MORT, the flag WOMAN is not set to 0 here and stays
	* missing for households without a WOMAN record - confirm this is intended.
	merge m:1 hhid using "$temp\WOMAN_mortality_`s'", keep(master match) nogenerate
	replace mort_women = 0 if missing(mort_women)

/*	
	
/* The reference period is 01.01.2011 - 31.12.2011. Hence, the reference period
	has a length of 365.25 days.
*/
	gen refdays = 365.25
	lab var refdays "number of days in reference period"

* end of reference period
	gen endrefdate_1 = "2011/12/31"
	gen double endrefdate_2 = clock(endrefdate_1, "YMD")
	gen refend = dofc(endrefdate_2)
	format refend %td
	drop endrefdate*
	lab var refend "reference period ended on 31/12/2011"

* birth date
	di "BIRTH DATE"
	tab date_of_birth, m
	di "gen birth date variable"
	gen bday = date_of_birth
	replace bday = . if bday > 31 | bday == 0
	tab bday, m
	
	di "BIRTH MONTH"
	tab month_of_birth, m
	di "gen birth month variable"
	gen bmonth = month_of_birth
	replace bmonth = . if bmonth > 12 | bday == 0
	tab bday, m
	
	di "BIRTH YEAR"
	tab year_of_birth, m
	di "gen birth year variable"
	gen byear = year_of_birth
	replace byear = . if byear > 2011
	
*/
* store the state-level file (COMB wave 3 data plus the merged MORT and WOMAN
* mortality variables)
save "$final\CMW`s'_w3", replace
}
}
